# pip install plotly
import pandas as pd
import zipfile
import requests
import os
import plotly.io as pio
import numpy as np
import plotly.graph_objs as go
import plotly.offline as offline
import re
import plotly.graph_objs as go
Note: the Ubuntu data file is large, so expect the next cell to take several seconds to run.
import requests

# Download the zipped Ubuntu support extract and store it under ./data.
print('Beginning data download.')
url = 'https://edgetier.s3-eu-west-1.amazonaws.com/ubunbu-code-test-data/ubuntu_support_extract.csv.zip'
# Make sure the target directory exists so open() below cannot fail on a
# fresh checkout.
os.makedirs('./data', exist_ok=True)
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=60)
# Fail loudly on a 4xx/5xx response instead of saving an error page as a
# .zip file that would only break later when it is unzipped.
r.raise_for_status()
with open('./data/ubuntu_support_data.csv.zip', 'wb') as fh:
    fh.write(r.content)
# Retrieve HTTP meta-data (.get avoids a KeyError if the header is absent)
print(f"Complete, status: {r.status_code}, content:{r.headers.get('content-type', 'unknown')}")
Beginning data download. Complete, status: 200, content:application/zip
# Extract the downloaded archive into ./data, then list the directory so the
# extracted CSV can be confirmed by eye.
print("Unzipping Data")
with zipfile.ZipFile("./data/ubuntu_support_data.csv.zip", mode='r') as archive:
    archive.extractall(path="./data/")
print("Data unzipped!")
os.listdir("./data")
Unzipping Data Data unzipped!
['empty.txt', 'ubuntu_support_data.csv.zip', 'ubuntu_support_extract.csv']
# Read the extracted CSV into a pandas DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="./data/ubuntu_support_extract.csv")
data.head(n=5)
| conversation_id | datetime | from | to | text | |
|---|---|---|---|---|---|
| 0 | 10-10000 | 2010-04-17 20:15:00+00:00 | fk91 | NaN | Hello, I have a minimal linux system: how can ... |
| 1 | 10-10000 | 2010-04-17 20:15:00+00:00 | fk91 | NaN | @Maco: ip is there, thanks :) |
| 2 | 10-10000 | 2010-04-17 20:15:00+00:00 | sometux | fk91 | ifconfig |
| 3 | 10-10000 | 2010-04-17 20:15:00+00:00 | sometux | fk91 | static or dhcp |
| 4 | 10-10000 | 2010-04-17 20:16:00+00:00 | fk91 | NaN | static |
# (rows, columns) of the raw DataFrame.
data.shape
(3075574, 5)
In my view, the users who respond most quickly and are most responsive should be selected as customer service agents, since efficiency and quick response are key in any customer contact centre. The top 15 conversation/date rows by message count (shown in the table below) contain 11 distinct users, so the users selected are: sebsebseb, rigved, pjc80la08, chelz, nyqvist, reighnakj, brjannc, aamit1, porg, domedagen, sab1.
# Drop only the rows in which every column is missing. Rows that are missing
# just some values (for example a NaN recipient in "to") are kept.
data = data.dropna(axis=0, how='all')
data.shape
(3075574, 5)
# Preview the first five rows after the all-NaN rows have been dropped.
data.head(5)
| conversation_id | datetime | from | to | text | |
|---|---|---|---|---|---|
| 0 | 10-10000 | 2010-04-17 20:15:00+00:00 | fk91 | NaN | Hello, I have a minimal linux system: how can ... |
| 1 | 10-10000 | 2010-04-17 20:15:00+00:00 | fk91 | NaN | @Maco: ip is there, thanks :) |
| 2 | 10-10000 | 2010-04-17 20:15:00+00:00 | sometux | fk91 | ifconfig |
| 3 | 10-10000 | 2010-04-17 20:15:00+00:00 | sometux | fk91 | static or dhcp |
| 4 | 10-10000 | 2010-04-17 20:16:00+00:00 | fk91 | NaN | static |
# Optional filter (currently disabled): drop messages that start with
# "#ubuntu-offtopic".
# data = data.loc[~data['text'].str.contains(r'^#ubuntu-offtopic\s*', regex=True, case=False, na=False)]
# print the updated dataframe
# data.head(5)

# Keep only the messages whose text mentions "ubuntu" (case-insensitive).
# na=False already treats a missing text as "no match", so no separate
# dropna() is needed, and case=False makes a prior .str.lower() pass
# redundant. The mask is built from, and applied to, the same frame.
has_ubuntu = data['text'].str.contains('ubuntu', case=False, na=False)
# Take an explicit copy so the column assignments below modify an independent
# frame rather than a selection of `data`.
filtered_df = data.loc[has_ubuntu].copy()

# Normalise user names so the same user is not counted under two spellings.
filtered_df['from'] = filtered_df['from'].str.lower()
filtered_df['to'] = filtered_df['to'].str.lower()
filtered_df.head(5)
| conversation_id | datetime | from | to | text | |
|---|---|---|---|---|---|
| 22 | 10-10002 | 2010-04-17 14:48:00+00:00 | tumenjargal | NaN | I'm using firefox 3.5.9 on ubuntu os. |
| 30 | 10-10003 | 2010-11-10 16:26:00+00:00 | redvil | NaN | hello there..anyone knows an iTunes equivalent... |
| 68 | 10-10009 | 2010-04-18 04:32:00+00:00 | ken8521 | nimbiotics | i used the first post in this thread... very s... |
| 92 | 10-10012 | 2010-04-24 15:41:00+00:00 | andra | belus | http://ubuntuforums.org/showthread.php?t=11478 |
| 97 | 10-10012 | 2010-04-24 15:54:00+00:00 | andra | NaN | guys I am currently operating ubuntu on a Gate... |
# After keeping only the messages that mention "ubuntu", 306444 rows remain.
filtered_df.shape
(306444, 5)
# Some messages were posted without a recipient (NaN in "to"). Keep only the
# rows whose recipient also appears as a sender in "from", so the top users
# are picked from people taking part on both sides of a conversation.
# This reduces the row count to 290751.
users_present = filtered_df['to'].isin(filtered_df['from'])
filtered_df = filtered_df.loc[users_present]
filtered_df.shape
(290751, 5)
# Reduce the timestamp to a calendar date so messages can be counted per
# conversation, per day, per sender.
filtered_df['date'] = pd.to_datetime(filtered_df['datetime']).dt.date

# Count messages for every (conversation, date, sender) combination, order
# the result by message count and then date (both descending), and keep the
# 15 busiest rows as the indicator of fast, responsive communication.
grouped_data = (
    filtered_df
    .groupby(['conversation_id', 'date', 'from'])['text']
    .count()
    .reset_index(name='message_count')
    .sort_values(by=['message_count', 'date'], ascending=False)
    .head(15)
)
grouped_data
| conversation_id | date | from | message_count | |
|---|---|---|---|---|
| 60020 | 219-1 | 2010-02-13 | sebsebseb | 41 |
| 99574 | 338-1 | 2010-05-13 | sebsebseb | 37 |
| 127715 | 433-1 | 2010-12-23 | rigved | 35 |
| 60019 | 219-1 | 2010-02-13 | pjc80la08 | 29 |
| 150875 | 532-1 | 2010-06-28 | sebsebseb | 27 |
| 44018 | 152-4 | 2010-09-18 | chelz | 25 |
| 127716 | 433-1 | 2010-12-24 | rigved | 19 |
| 61706 | 221-2 | 2010-02-17 | nyqvist | 19 |
| 98389 | 315-1 | 2011-07-10 | reighnakj | 18 |
| 150873 | 532-1 | 2010-06-27 | sebsebseb | 18 |
| 28982 | 122-3 | 2011-11-14 | brjannc | 17 |
| 47610 | 165-1 | 2011-10-11 | aamit1 | 17 |
| 100740 | 356-1 | 2011-04-04 | porg | 17 |
| 209831 | 96-6 | 2011-09-25 | domedagen | 16 |
| 28955 | 120-14 | 2010-04-18 | sab1 | 16 |
# Bar trace: one bar per selected row, labelled with its message count.
trace = go.Bar(x=grouped_data['from'], y=grouped_data['message_count'],
               text=grouped_data['message_count'], textposition='auto')

# Plot title and axis captions.
layout = go.Layout(
    title='Number of texts by conversation_id and date for each users',
    xaxis={'title': 'Users'},
    yaxis={'title': 'Number of texts'},
)

# Assemble the figure from the single trace and render it.
fig = go.Figure(data=[trace], layout=layout)
fig.show()
# Summary statistics for the selected rows: 15 rows in total, minimum number
# of messages 16, maximum number of messages 41.
print(grouped_data.describe())
message_count count 15.000000 mean 23.400000 std 8.475174 min 16.000000 25% 17.000000 50% 19.000000 75% 28.000000 max 41.000000
# Render subsequent plotly figures with the notebook renderer.
pio.renderers.default = 'notebook'

# Label every row with its conversation id and date, e.g. "219-1 - 2010-02-13".
grouped_data['conv_date'] = (
    grouped_data['conversation_id'].astype(str)
    + ' - '
    + grouped_data['date'].astype(str)
)

# Reshape so that each conversation/date label is a row and each user is a
# column; combinations without messages are filled with 0.
pivot_data = grouped_data.pivot_table(
    index='conv_date',
    columns='from',
    values='message_count',
    fill_value=0,
)

# One bar trace per user, so each user's messages form one layer of the stack.
traces = [
    go.Bar(
        x=pivot_data.index,
        y=pivot_data[user],
        name=user,
        hovertemplate='Count: %{y}',
    )
    for user in pivot_data.columns
]

# Stacked layout with title and axis captions.
layout = go.Layout(
    barmode='stack',
    title='Number of texts by conversation_id and date for each user',
    xaxis={'title': 'Grouped by conversation_id and date'},
    yaxis={'title': 'Number of texts'},
)

# Build the figure from all traces and render it.
fig = go.Figure(data=traces, layout=layout)
fig.show()
# offline.plot(fig, filename='stacked_bar_plot.html', auto_open=True)
grouped_data
| conversation_id | date | from | message_count | conv_date | |
|---|---|---|---|---|---|
| 60020 | 219-1 | 2010-02-13 | sebsebseb | 41 | 219-1 - 2010-02-13 |
| 99574 | 338-1 | 2010-05-13 | sebsebseb | 37 | 338-1 - 2010-05-13 |
| 127715 | 433-1 | 2010-12-23 | rigved | 35 | 433-1 - 2010-12-23 |
| 60019 | 219-1 | 2010-02-13 | pjc80la08 | 29 | 219-1 - 2010-02-13 |
| 150875 | 532-1 | 2010-06-28 | sebsebseb | 27 | 532-1 - 2010-06-28 |
| 44018 | 152-4 | 2010-09-18 | chelz | 25 | 152-4 - 2010-09-18 |
| 127716 | 433-1 | 2010-12-24 | rigved | 19 | 433-1 - 2010-12-24 |
| 61706 | 221-2 | 2010-02-17 | nyqvist | 19 | 221-2 - 2010-02-17 |
| 98389 | 315-1 | 2011-07-10 | reighnakj | 18 | 315-1 - 2011-07-10 |
| 150873 | 532-1 | 2010-06-27 | sebsebseb | 18 | 532-1 - 2010-06-27 |
| 28982 | 122-3 | 2011-11-14 | brjannc | 17 | 122-3 - 2011-11-14 |
| 47610 | 165-1 | 2011-10-11 | aamit1 | 17 | 165-1 - 2011-10-11 |
| 100740 | 356-1 | 2011-04-04 | porg | 17 | 356-1 - 2011-04-04 |
| 209831 | 96-6 | 2011-09-25 | domedagen | 16 | 96-6 - 2011-09-25 |
| 28955 | 120-14 | 2010-04-18 | sab1 | 16 | 120-14 - 2010-04-18 |
# Total number of messages sent by each user (one row per "from" value).
user_counts = (
    filtered_df
    .groupby('from')
    .size()
    .reset_index(name='message_count')
)
# Most active senders first.
sorted_users = user_counts.sort_values('message_count', ascending=False)
sorted_users.head(n=5)
| from | message_count | |
|---|---|---|
| 838 | actionparsnip | 3632 |
| 14125 | floodbot1 | 3241 |
| 11323 | dr_willis | 2494 |
| 19254 | ikonia | 2147 |
| 4157 | bazhang | 2138 |
# The 15 users with the highest total message count.
# NOTE(review): the displayed result includes floodbot1/2/3, which are
# presumably automated accounts - confirm whether they should be excluded.
top_15_users = sorted_users.iloc[:15]
top_15_users
| from | message_count | |
|---|---|---|
| 838 | actionparsnip | 3632 |
| 14125 | floodbot1 | 3241 |
| 11323 | dr_willis | 2494 |
| 19254 | ikonia | 2147 |
| 4157 | bazhang | 2138 |
| 38406 | sebsebseb | 1691 |
| 21835 | jordan_u | 1491 |
| 14127 | floodbot3 | 1442 |
| 33732 | pici | 1420 |
| 37346 | rww | 1400 |
| 14126 | floodbot2 | 1387 |
| 48991 | zykotick9 | 1182 |
| 22069 | jrib | 1151 |
| 12104 | edbian | 1148 |
| 8363 | coz_ | 1078 |
# With the top 15 users chosen by total messages sent, print summary
# statistics and draw a bar chart as a sanity check on the selection.
import matplotlib.pyplot as plt

# Summary statistics of the message counts
print(top_15_users.describe())

# One bar per user; user names are rotated vertically so they do not overlap.
plt.bar(x=top_15_users['from'], height=top_15_users['message_count'])
plt.xlabel('User')
plt.ylabel('Number of messages')
plt.xticks(rotation=90)
plt.show()
message_count count 15.00000 mean 1802.80000 std 784.16855 min 1078.00000 25% 1284.50000 50% 1442.00000 75% 2142.50000 max 3632.00000
This is a Python function called has_phatic_expression that takes a sentence as input and checks if the sentence contains one or more phatic expressions. Phatic expressions are typically used to establish social relationships and maintain social harmony, rather than to convey any particular meaning.
The steps performed are as mentioned below:
Testing the scenarios:
# Phatic expressions recognised by has_phatic_expression(), compiled once.
# Multi-word phrases use \s+ between words so that any run of whitespace
# (double spaces, tabs, newlines) still matches.
_PHATIC_EXPRESSION = re.compile(
    r'\b(?:'
    r'hello|hi|hey'
    r'|how\s+are\s+you'
    r'|nice\s+to\s+meet\s+you'
    r'|good\s+morning'
    r'|good\s+afternoon'
    r'|good\s+evening'
    r'|thanks|thank\s+you|cheers'
    r'|bye|goodbye'
    r'|have\s+a\s+good\s+day'
    r'|nice\s+morning'
    r')\b',
    re.IGNORECASE,
)
_SENTENCE_TERMINATOR = re.compile(r'[.?!]')
_TOKEN = re.compile(r'\S+')


def has_phatic_expression(sentence):
    """Return True if *sentence* is judged to be phatic (social small talk).

    Phatic expressions are searched for in the whole text, so multi-word
    phrases such as "how are you" or "good morning" are recognised (matching
    token by token can never find them).

    The text is considered phatic when any of the following holds:

    * it contains more than one phatic expression;
    * it contains exactly one phatic expression and at least two sentence
      terminators (``.``, ``?`` or ``!``);
    * it contains exactly one phatic expression and every
      whitespace-separated token is part of it (e.g. "Hello", "Thank you!").

    Args:
        sentence: The text to inspect. ``None``, an empty string or a
            whitespace-only string is treated as "not phatic".

    Returns:
        bool: True if the text is judged phatic by the rules above,
        otherwise False.
    """
    # Nothing to analyse. Without this guard an empty input would satisfy
    # "every token is phatic" vacuously and be reported as phatic.
    if not sentence or not sentence.strip():
        return False

    spans = [match.span() for match in _PHATIC_EXPRESSION.finditer(sentence)]
    if not spans:
        return False
    if len(spans) > 1:
        return True

    # Exactly one phatic expression from here on.
    # Splitting on terminators yields (number of terminators + 1) parts, so
    # "> 2" means the text contains at least two terminators.
    if len(_SENTENCE_TERMINATOR.split(sentence.strip())) > 2:
        return True

    # The text consists of nothing but the phatic expression when every token
    # overlaps the matched span (attached punctuation such as "Hello!" is
    # tolerated because the token still overlaps the match).
    start, end = spans[0]
    return all(
        token.start() < end and start < token.end()
        for token in _TOKEN.finditer(sentence)
    )
# Try the function on four representative inputs; the expected result for
# each one is noted next to it.
for example_sentence in (
    "Hello, how are you doing today? Can you please help me with my account?",  # Should return True
    "Hey,I need help with my account",  # Should return False
    "Hello Hello Hello",  # Should return True
    "Hello",  # Should return True
):
    print(has_phatic_expression(example_sentence))
True False True True